{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "We show here how to use the retrieval completion function to add context from documents to any OpenAI Evals task.\n",
    "The toy example augments our Born-First task with a dataset of presidential birthdays.\n",
    "\"\"\"\n",
    "\n",
    "# Download the dataset manually, or fetch it with curl.\n",
    "# --fail makes curl exit with an error on an HTTP error response instead of\n",
    "# saving the server's error page as the CSV; --location follows redirects.\n",
    "!curl --fail --location -O https://people.math.sc.edu/Burkardt/datasets/presidents/president_birthdays.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from openai import OpenAI\n",
    "import pandas as pd\n",
    "\n",
    "# The rename keys expect the CSV header fields after \"Index\" to carry a leading\n",
    "# space and literal double quotes (e.g. ' \"Name\"'); map them to clean names.\n",
    "header_fixes = {f' \"{col}\"': col for col in (\"Name\", \"Month\", \"Day\", \"Year\")}\n",
    "df = (\n",
    "    pd.read_csv(\"president_birthdays.csv\")\n",
    "    .rename(columns=header_fixes)\n",
    "    .set_index(\"Index\")\n",
    ")\n",
    "\n",
    "\n",
    "def birthday_sentence(row):\n",
    "    \"\"\"Format one row as '<Name> was born on <Month>/<Day>/<Year>'.\"\"\"\n",
    "    return f\"{row['Name']} was born on {row['Month']}/{row['Day']}/{row['Year']}\"\n",
    "\n",
    "\n",
    "# One sentence per row; this text column is what gets embedded and written to\n",
    "# presidents_embeddings.csv further down in this cell.\n",
    "df[\"text\"] = df.apply(birthday_sentence, axis=1)\n",
    "display(df.head())\n",
    "\n",
    "# Build the API client once and reuse it for every row instead of constructing a\n",
    "# new client per call; the key is read from the OPENAI_API_KEY environment variable.\n",
    "_openai_client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))\n",
    "\n",
    "\n",
    "def embed(text, model=\"text-embedding-ada-002\"):\n",
    "    \"\"\"Return the embedding vector for `text`.\n",
    "\n",
    "    Args:\n",
    "        text: The string to embed.\n",
    "        model: Name of the OpenAI embedding model to request.\n",
    "\n",
    "    Returns:\n",
    "        The `embedding` attribute of the first item in the API response.\n",
    "    \"\"\"\n",
    "    response = _openai_client.embeddings.create(model=model, input=text)\n",
    "    return response.data[0].embedding\n",
    "\n",
    "# Embed each sentence (embed() issues one API request per row), then persist\n",
    "# the text alongside its vector.\n",
    "sentence_vectors = df[\"text\"].apply(embed)\n",
    "df[\"embedding\"] = sentence_vectors\n",
    "embedding_table = df.loc[:, [\"text\", \"embedding\"]]\n",
    "embedding_table.to_csv(\"presidents_embeddings.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "We create the registry entries here in code. Note that we set the number of retrieved documents to k=2.\n",
    "\"\"\"\n",
    "\n",
    "# Both entries point at the same embeddings file, so resolve its path once.\n",
    "embeddings_path = os.path.abspath('presidents_embeddings.csv')\n",
    "\n",
    "# One registry entry per wrapped completion function; only the name differs.\n",
    "entry_template = \"\"\"\n",
    "retrieval/presidents/{completion_fn}:\n",
    "  class: evals.completion_fns.retrieval:RetrievalCompletionFn\n",
    "  args:\n",
    "    completion_fn: {completion_fn}\n",
    "    embeddings_and_text_path: {embeddings_path}\n",
    "    k: 2\n",
    "\"\"\".strip()\n",
    "\n",
    "registry_yaml = \"\\n\\n\".join(\n",
    "    entry_template.format(completion_fn=name, embeddings_path=embeddings_path)\n",
    "    for name in (\"gpt-3.5-turbo\", \"cot/gpt-3.5-turbo\")\n",
    ")\n",
    "\n",
    "# Replace with path to your registry\n",
    "registry_dir = \"completion_fns\"\n",
    "os.makedirs(registry_dir, exist_ok=True)\n",
    "with open(\"completion_fns/retrieval.yaml\", \"w\") as registry_file:\n",
    "    registry_file.write(registry_yaml)\n",
    "\n",
    "# Run the born-first eval three ways, capped at 10 samples each. `--registry_path .`\n",
    "# passes the current directory as a registry path (presumably so oaieval picks up the\n",
    "# completion_fns/retrieval.yaml written above -- confirm against the oaieval docs).\n",
    "# The accuracies quoted below are the figures recorded by the notebook's author.\n",
    "\n",
    "# GPT-3.5-turbo base: accuracy 0.7\n",
    "!oaieval gpt-3.5-turbo born-first --max_samples 10 --registry_path .\n",
    "\n",
    "# GPT-3.5-turbo with retrieval: accuracy 0.9. The failure mode here is that the\n",
    "# retrieved president is incorrect: Andrew Johnson vs Andrew Jackson.\n",
    "!oaieval retrieval/presidents/gpt-3.5-turbo born-first --max_samples 10 --registry_path .\n",
    "\n",
    "# GPT-3.5-turbo with retrieval and chain-of-thought: accuracy 1.0\n",
    "!oaieval retrieval/presidents/cot/gpt-3.5-turbo born-first --max_samples 10 --registry_path ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
